Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)
To run this notebook reproducibly, follow these steps:
In [ ]:
g_dataset_name = "Notebook7Test"
g_prepped_counts_run_prefix = "TestSet7"
g_prepped_counts_dir = '~/dual_crispr/test_data/test_set_7'
g_min_count_limit = 10 #Note: in absolute counts, not log2
g_max_fraction_acceptable_spline_density_diff = 0.02 # % of diff between max spline and min density
g_max_fraction_counts_excluded = 0.95 # any threshold throwing out >x% of counts is not acceptable
g_thresholds_run_prefix = ""
g_thresholds_dir = '~/dual_crispr/test_outputs/test_set_7'
In [ ]:
import inspect
import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs
def describe_var_list(input_var_name_list):
description_list = ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
return "".join(description_list)
ns_logs.set_stdout_info_logger()
In [ ]:
g_prepped_counts_dir = ns_files.expand_path(g_prepped_counts_dir)
g_thresholds_run_prefix = ns_runs.check_or_set(g_thresholds_run_prefix, ns_runs.generate_run_prefix(g_dataset_name))
g_thresholds_dir = ns_files.expand_path(g_thresholds_dir)
print(describe_var_list(['g_prepped_counts_dir', 'g_thresholds_run_prefix', 'g_thresholds_dir']))
ns_files.verify_or_make_dir(g_thresholds_dir)
In [ ]:
%load_ext rpy2.ipython
In [ ]:
from rpy2.robjects import r
import rpy2.robjects as robjects
gR = robjects.r
In [ ]:
import dual_crispr.scoring_prep as ns_prep
print(inspect.getsource(ns_prep.get_prepped_file_suffix))
In [ ]:
import dual_crispr.construct_file_extracter as ns_extractor
print(inspect.getsource(ns_extractor.get_construct_header))
print(inspect.getsource(ns_extractor.get_potential_annotation_headers))
In [ ]:
import pandas
def get_prepped_counts_only_df(prepped_counts_dir, prepped_counts_run_prefix):
prepped_counts_suffix = ns_prep.get_prepped_file_suffix()
prepped_counts_fp = ns_files.build_multipart_fp(prepped_counts_dir, [prepped_counts_run_prefix,
prepped_counts_suffix])
prepped_counts_df = pandas.read_table(prepped_counts_fp, index_col=ns_extractor.get_construct_header())
total_headers = list(prepped_counts_df.columns.values)
unwanted_headers = ns_extractor.get_potential_annotation_headers()
count_headers = [x for x in total_headers if x not in unwanted_headers]
return prepped_counts_df.loc[:, count_headers]
In [ ]:
g_prepped_counts_only_df = get_prepped_counts_only_df(g_prepped_counts_dir, g_prepped_counts_run_prefix)
g_prepped_counts_only_df
In [ ]:
# 'temp' assignments suppress printing of cruft stdout
temp = gR.assign('gPreppedCountsDf', g_prepped_counts_only_df)
temp = gR.assign('gMinCountLimit', g_min_count_limit)
temp = gR.assign('gMaxFractionAcceptableSplineDensityDiff', g_max_fraction_acceptable_spline_density_diff)
temp = gR.assign('gMaxFractionCountsExcluded', g_max_fraction_counts_excluded)
In [ ]:
%%R
options(jupyter.plot_mimetypes = c("text/plain", "image/png" ))
options(repr.plot.width=7, repr.plot.height=7)
options(digits=10)
In [ ]:
%%R
gManualColor = "black"
gMinimumsColor = "orange"
gMaximumsColor = "turquoise"
gHistogramColor = "blue"
gDensityColor = "lightgreen"
gSplineColor = "lightpink"
gChosenColor = "red"
gSpar = NULL
extremumIsAcceptable<-function(putativeThreshold, hist, maxCountFractionExcluded){
result = FALSE
if (!is.null(putativeThreshold)) {
fractionCountsExcluded = getFractionCountsExcluded(hist, putativeThreshold,
maxCountFractionExcluded)
result = fractionCountsExcluded<maxCountFractionExcluded
}
return(result)
}
getFractionCountsExcluded<-function(hist, putativeThreshold, maxCountFractionExcluded){
tempHistDf = data.frame(mids=hist$mids, counts=hist$counts)
eligibleHistDf = tempHistDf[which(hist$mids<putativeThreshold), ]
result = sum(eligibleHistDf$counts)/sum(hist$counts)
return(result)
}
findExtremaIndices<-function(objWithXandY, getMins=TRUE){
relevantDiff = if(getMins==TRUE) 2 else -2
indicesOfExtrema = which(diff(sign(diff(objWithXandY$y)))==relevantDiff)+1
return(indicesOfExtrema)
}
getlog2CountsAndFreqsAtExtrema<-function(densityObj, indicesOfExtrema){
log2CountsAtExtrema = densityObj$x[indicesOfExtrema]
densityFunc = approxfun(densityObj)
freqsAtExtrema = densityFunc(log2CountsAtExtrema)
result = data.frame(log2CountsAtExtrema, freqsAtExtrema)
result = result[with(result, order(log2CountsAtExtrema)), ]
return(result)
}
# general concept: identify the peak of the "main distribution", then look for the lowest point in the
# "valley" between it and the noise spike at the low end of the counts histogram.
# Not all count distributions have this general shape; for all known cases that don't, this method
# will return NULL (rather than just silently picking a bad threshold).
findSmallestMinLeftOfMax<-function(splineWithXandY, minCountLimit, hist, maxCountFractionExcluded){
minLog2CountThreshold = log2(minCountLimit)
result = NULL # assume failure
#look for row indices of local interior maxima and local interior minima in input spline curve
indicesOfMaxes = findExtremaIndices(splineWithXandY, FALSE)
indicesOfMins = findExtremaIndices(splineWithXandY, TRUE)
# give up if there aren't at least one of each; otherwise
if (length(indicesOfMaxes)>0 & length(indicesOfMins)>0){
# get x and y values in the rows representing the local interior maxima
xAndYatMaxesDf = getlog2CountsAndFreqsAtExtrema(splineWithXandY, indicesOfMaxes)
eligibleMaxesDf = xAndYatMaxesDf[which(
xAndYatMaxesDf$log2CountsAtExtrema >= minLog2CountThreshold), ]
# if there are no local interior maxima at x values gte the global minimum allowed, give up; otherwise
if (nrow(eligibleMaxesDf)>0){
# pick the x position of the eligible local interior maximum with the largest y value
chosenMaxDf = eligibleMaxesDf[which(
eligibleMaxesDf$freqsAtExtrema == max(eligibleMaxesDf$freqsAtExtrema)), ]
rightmostLog2Count = chosenMaxDf$log2CountsAtExtrema[1]
# get x and y values in the rows representing the local interior minima
xAndYatMinsDf = getlog2CountsAndFreqsAtExtrema(splineWithXandY, indicesOfMins)
eligibleMinsDf = xAndYatMinsDf[which(
xAndYatMinsDf$log2CountsAtExtrema >= minLog2CountThreshold &
xAndYatMinsDf$log2CountsAtExtrema < rightmostLog2Count), ]
# if there are no local interior minima with x values gte the global minimum allowed and
# lt the x position of the chosen maximum, give up; otherwise
if (nrow(eligibleMinsDf)>0){
# pick the x position of the eligible local interior minimum with the smallest y value
chosenMinDf = eligibleMinsDf[which(
eligibleMinsDf$freqsAtExtrema == min(eligibleMinsDf$freqsAtExtrema)), ]
putativeResult = chosenMinDf$log2CountsAtExtrema
# Only known situation where above logic picks a bad threshold is when all "real"
# data is monotonically decreasing but there is (at least one) minute local maximum
# in the noise at far right of the count distribution; extremumIsAcceptable sanity-checks
# for that pathological case.
if (extremumIsAcceptable(putativeResult, hist, maxCountFractionExcluded)){
result = putativeResult
}
}
}
}
return(result)
}
# helper for findSplineAndDensityNearPoint
makeSplineAndDensityDf<-function(scaledDensityXandY, splineXandY){
# Determine spline and (scaled) density y values at shared set of x values
# where neither is NA, then calculate difference between spline and density
# at each of those points.
splineFunc = approxfun(splineXandY)
splineYAtDensityX = splineFunc(scaledDensityXandY$x)
result = data.frame(x=scaledDensityXandY$x, splineY=splineYAtDensityX,
densityY=scaledDensityXandY$y)
result = na.omit(result)
result$y = result$splineY-result$densityY
return(result)
}
# helper for findSplineAndDensityNearPoint
getNearnessThreshold<-function(splineAndDensityDf, maxSplineDensityDiff){
# Get global maximum value of spline function at any x
# Get global minimum of scaled density function at any x
# NB: x for max and min need not (and usually won't) be the same
# Use these values to find the maximum difference between spline
# and scaled density y values (regardless of x), then define
# "near" to be when spline and scaled density y values for same x get within
# the specified arbitrary fraction of that difference.
maxSplineY = max(splineAndDensityDf$splineY)
minDensityY = min(splineAndDensityDf$densityY)
maxDiff = maxSplineY - minDensityY
result = maxDiff * maxSplineDensityDiff
return(result)
}
# general concept: find the leftmost point (greater than the global minimum allowed)
# in the count distribution where the scaled density curve and the spline curve are
# within the global arbitrary threshold of one another.
# This gives worse results than findSmallestMinLeftOfMax on "good" count distributions,
# so it isn't the first-choice approach, but it makes a good fall-back for count
# distributions (especially noisy or low-signal ones) where findSmallestMinLeftOfMax
# fails to find a threshold. Fails to find a threshold ONLY in cases where
# spline and density curve never get "near" each other over range of
# counts in the underlying count distribution.
findSplineAndDensityNearPoint<-function(scaledDensityXandY, splineXandY, minCountLimit,
maxFractionAcceptableSplineDensityDiff, hist, maxCountFractionExcluded){
log2minCountLimit = log2(minCountLimit)
maxSplineDensityDiff = maxFractionAcceptableSplineDensityDiff
result = NULL # assume failure
splineAndDensityDf = makeSplineAndDensityDf(scaledDensityXandY, splineXandY)
nearnessThreshold = getNearnessThreshold(splineAndDensityDf, maxSplineDensityDiff)
# if there are no records whose x positions are gte the global minimum allowed,
# give up; otherwise
eligibleSplineAndDensityDf = splineAndDensityDf[which(
splineAndDensityDf$x >= log2minCountLimit), ]
if (nrow(eligibleSplineAndDensityDf)>0){
# Walk through all eligible x positions, from smallest toward largest.
# Assuming you don't get lucky and just find an x value right on the threshold,
# find the pair of x positions (if any such exist) that bracket the
# spot where the spline and density curves get "near enough" to each other.
# Return the point half-way between these two x positions (note that this is
# obviously a punt--I *could* do numerical approximation to find it, or
# set up a function that reached zero when the spline and density were
# "near enough" and then optimize it, but frankly it just doesn't seem
# worth the trouble ...)
putativeResult = NULL
largestXgtThresh = NULL
smallestXltThresh = NULL
for (i in 1:nrow(eligibleSplineAndDensityDf)){
currYval = eligibleSplineAndDensityDf$y[i]
currXval = eligibleSplineAndDensityDf$x[i]
if (currYval == nearnessThreshold) {
putativeResult = currXval
break
} else if (currYval <= nearnessThreshold) {
smallestLtThresh = currXval
if (is.null(largestXgtThresh)) {
putativeResult = smallestLtThresh
} else {
putativeResult = (smallestLtThresh - largestXgtThresh)/2 + largestXgtThresh
}
break
} else { # (currYval > nearnessThreshold)
largestXgtThresh = currXval
}
}
if (extremumIsAcceptable(putativeResult, hist, maxCountFractionExcluded)){
result = putativeResult
}
}
return(result)
}
analyzeCountsDist<-function(log2countsDfForSingleSample, rangeObj, minCountLimit, maxCountFractionExcluded,
maxFractionAcceptableSplineDensityDiff){
resultSummary = "No acceptable threshold found." # assume failure
rge<-rangeObj
increment = 0.05
log2CurrCountsHist<-hist(log2countsDfForSingleSample,
breaks=seq(0-increment,rge[2]+increment,by=increment),
plot=FALSE)
# density curve
scaleFactor = sum(log2CurrCountsHist$counts)*increment
log2CurrCountsDensity<-density(log2countsDfForSingleSample)
scaledLog2CurrCountsDensityDf = data.frame(x=log2CurrCountsDensity$x,
y=log2CurrCountsDensity$y*scaleFactor)
# smoothing spline curve of non-zero freqs only
log2CurrCountsHistXandY = data.frame(x=log2CurrCountsHist$mids, y=log2CurrCountsHist$count)
nonZeroLog2CurrCountsHistXandY = log2CurrCountsHistXandY[which(log2CurrCountsHistXandY$y>0), ]
log2CurrCountsSpline = smooth.spline(nonZeroLog2CurrCountsHistXandY$x, nonZeroLog2CurrCountsHistXandY$y)
# threshold selection
putativeThreshold = findSmallestMinLeftOfMax(log2CurrCountsSpline, minCountLimit,
log2CurrCountsHist, maxCountFractionExcluded)
if (!is.null(putativeThreshold)){
resultSummary = "Smallest-local-minimum-in-valley method used."
} else {
putativeThreshold = findSplineAndDensityNearPoint(scaledLog2CurrCountsDensityDf, log2CurrCountsSpline,
minCountLimit, maxFractionAcceptableSplineDensityDiff, log2CurrCountsHist, maxCountFractionExcluded)
if (!is.null(putativeThreshold)){
resultSummary = "Near-point-of-spline-and-density method used."
}
}
result = list(threshold = putativeThreshold, resultSummary=resultSummary,
histogram=log2CurrCountsHist,
scaledDensity=scaledLog2CurrCountsDensityDf, spline=log2CurrCountsSpline)
return(result)
}
drawAnalyzedCountsDist<-function(sampleName, rangeObj, analysisResult, manualThreshold=NULL){
rge<-rangeObj
xPositions = seq(from = 0, to = ceiling(rge[2])+1, by = 1)
xLabels = 2^(xPositions)
titleText = paste0(sampleName,"\n", analysisResult$resultSummary)
hist = analysisResult$histogram
plot(hist,
col=gHistogramColor,
border=FALSE,
main=titleText,
xaxt = 'n',
xlab=""
)
axis(side = 1, at = xPositions, labels=xLabels, las=2)
mtext("counts (pseudocount added to zeros only)", side=1, line=3)
# density curve
lines(analysisResult$scaledDensity,col=gDensityColor)
# smoothing spline curve of non-zero freqs only
lines(analysisResult$spline, col=gSplineColor)
# rug plot of manual threshold, if any
if (!is.null(manualThreshold)){
rug(manualThreshold, col=gManualColor, lwd=3)
}
# vertical line of selected threshold, if any
analysisThreshold = analysisResult$threshold
if (!is.null(analysisThreshold)){
abline(v=analysisThreshold, col=gChosenColor)
fractionExcludedCounts = getFractionCountsExcluded(analysisResult$histogram,
analysisThreshold, maxCountFractionExcluded)
percentExcludedCounts = fractionExcludedCounts*100
title(sub=paste0(format(round(percentExcludedCounts, 1), nsmall = 1), "% of counts excluded"))
}
}
analyzeAndDrawCountsDists<-function(multiSampleCountsDf, minCountLimit, maxCountFractionExcluded,
maxFractionAcceptableSplineDensityDiff, manualThresholds=NULL){
resultDf = data.frame(sampleName = character(0), log2CountsThresh = numeric(0));
multiSampleCountsDf[multiSampleCountsDf==0]<-1 #pseudocounts
log2MultiSampleCountsDf = log2(multiSampleCountsDf)
rangeObj = range(log2MultiSampleCountsDf)
for (i in 1:ncol(multiSampleCountsDf)) {
currSampleName = colnames(multiSampleCountsDf)[i]
log2countsDfForSingleSample = log2MultiSampleCountsDf[, i]
analysisResult = analyzeCountsDist(log2countsDfForSingleSample, rangeObj,
minCountLimit, maxCountFractionExcluded, maxFractionAcceptableSplineDensityDiff)
outputThreshold = if (is.null(analysisResult$threshold)) NA else analysisResult$threshold
resultDf = rbind(resultDf, data.frame(sampleName=currSampleName, log2CountsThresh=outputThreshold))
currManualThreshold = NULL
if (!is.null(manualThresholds)){
if (length(manualThresholds)>=i){
currManualThreshold = log2(manualThresholds[i])
}
}
drawAnalyzedCountsDist(currSampleName, rangeObj, analysisResult, currManualThreshold)
}
return(resultDf)
}
In [ ]:
%%R
gThresholdsDf = analyzeAndDrawCountsDists(gPreppedCountsDf, gMinCountLimit, gMaxFractionCountsExcluded,
gMaxFractionAcceptableSplineDensityDiff)
In [ ]:
%R gThresholdsDf
In [ ]:
if gR['gThresholdsDf'].isnull().values.any():
raise RuntimeError("Automated abundance threshold selection was not able to identify thresholds for all samples.")
In [ ]:
import dual_crispr.scoring_prep as ns_prep
print(inspect.getsource(ns_prep.get_sample_name_header))
print(inspect.getsource(ns_prep.get_abundance_thresh_header))
print(inspect.getsource(ns_prep.get_abundance_thresh_file_suffix))
In [1]:
def write_thresholds_file(thresholds_df, run_prefix, output_dir):
thresholds_df.columns = [ns_prep.get_sample_name_header(), ns_prep.get_abundance_thresh_header()]
output_fp = ns_files.build_multipart_fp(output_dir, [run_prefix, ns_prep.get_abundance_thresh_file_suffix()])
try:
thresholds_df.to_csv(output_fp, index=False, sep='\t')
except AttributeError: # if there is no to_csv method
thresholds_df.to_csvfile(output_fp, row_names=False, sep='\t')
In [ ]:
write_thresholds_file(gR['gThresholdsDf'], g_thresholds_run_prefix, g_thresholds_dir)
In [ ]:
print(ns_files.check_file_presence(g_thresholds_dir, g_thresholds_run_prefix,
ns_prep.get_abundance_thresh_file_suffix(),
check_failure_msg="Abundance threshold selection failed to produce an output file.")
)